TODO: Add clear, descriptive titles and axis labels to all graphs.

First, I’ll pull the data set from my database.

# Pull up to 1M joined trip/fare rows from the database.
# NOTE(review): query_results() is defined elsewhere -- presumably it opens a
# connection and returns a data frame; confirm against its definition.
# The ORDER BY is commented out inside the SQL, so row order is
# database-dependent and not guaranteed to be stable between runs.
taxi_raw <- query_results("
  SELECT *
 FROM trips
 INNER JOIN fares ON trips.id = fares.trip_id
 -- ORDER BY trips.hack_license, trips.medallion, trips.pickup_datetime
 LIMIT 1000000;

")

# The join duplicates any column names shared by trips and fares
# (e.g. the join keys); keep only the first occurrence of each name.
taxi_raw <- taxi_raw[,unique(colnames(taxi_raw))]

In order to better analyze the data, let’s divide the longitude/latitude pairings into boroughs (this is just an estimation, boundaries aren’t perfect):

# Approximate the pickup borough from GPS coordinates using rough bounding
# boxes, checked in order: Manhattan, Brooklyn, Bronx, Staten Island, with
# Queens as the fallback. NA coordinates propagate to an NA borough.
# BUG FIX: the Brooklyn clause originally required
#   pickup_latitude < 40.704 & pickup_latitude > 40.739
# which is impossible (always FALSE), silently disabling that sub-box; the
# comparisons were inverted. Parentheses added to make &/| precedence
# explicit (they preserve R's default precedence: & binds tighter than |).
taxi_raw <- taxi_raw %>% 
  mutate(pickup_borough = 
    ifelse(pickup_longitude > -74.01 & pickup_longitude < -73.927 & 
             pickup_latitude > 40.701 & pickup_latitude < 40.875, "manhattan", 
    ifelse((pickup_latitude < 40.7 & pickup_longitude > -73.85) | 
             (pickup_longitude > -73.96 & pickup_longitude < -73.92 & 
                pickup_latitude > 40.704 & pickup_latitude < 40.739), "brooklyn", 
    ifelse(pickup_longitude > -73.927 | pickup_latitude > 40.875, "bronx",
    ifelse(pickup_longitude < -74.04, "staten_island", "queens")))))

# Approximate the dropoff borough from GPS coordinates using the same rough
# bounding boxes as the pickup classification: Manhattan, Brooklyn, Bronx,
# Staten Island in order, Queens as the fallback. NA coordinates yield NA.
# BUG FIX: the Brooklyn clause originally required
#   dropoff_latitude < 40.704 & dropoff_latitude > 40.739
# which is impossible (always FALSE), silently disabling that sub-box; the
# comparisons were inverted. Parentheses added to make &/| precedence
# explicit (they preserve R's default precedence: & binds tighter than |).
taxi_raw <- taxi_raw %>% 
  mutate(dropoff_borough = 
    ifelse(dropoff_longitude > -74.01 & dropoff_longitude < -73.927 & 
             dropoff_latitude > 40.701 & dropoff_latitude < 40.875, "manhattan", 
    ifelse((dropoff_latitude < 40.7 & dropoff_longitude > -73.85) | 
             (dropoff_longitude > -73.96 & dropoff_longitude < -73.92 & 
                dropoff_latitude > 40.704 & dropoff_latitude < 40.739), "brooklyn", 
    ifelse(dropoff_longitude > -73.927 | dropoff_latitude > 40.875, "bronx",
    ifelse(dropoff_longitude < -74.04, "staten_island", "queens")))))

# Coerce the raw columns to their proper types, grouped by kind.
# All assignments are independent, so order within mutate() is immaterial.
# NOTE(review): ymd_hms() parses as UTC by default but these timestamps are
# presumably local NYC time -- confirm, and if so pass
# tz = "America/New_York" (the original flagged this with "UTC incorrect").
taxi_raw <- taxi_raw %>% 
  mutate(
    # Timestamps
    pickup_datetime = ymd_hms(pickup_datetime),
    dropoff_datetime = ymd_hms(dropoff_datetime),
    # Categorical identifiers and codes
    medallion = as.factor(medallion),
    hack_license = as.factor(hack_license),
    vendor_id = as.factor(vendor_id),
    rate_code = as.factor(rate_code),
    payment_type = as.factor(payment_type),
    pickup_borough = as.factor(pickup_borough),
    dropoff_borough = as.factor(dropoff_borough),
    # Monetary amounts
    fare_amount = as.double(fare_amount),
    surcharge = as.double(surcharge),
    mta_tax = as.double(mta_tax),
    tip_amount = as.double(tip_amount),
    tolls_amount = as.double(tolls_amount),
    total_amount = as.double(total_amount),
    # Trip measurements
    trip_distance = as.double(trip_distance),
    trip_time_in_secs = as.integer(trip_time_in_secs),
    passenger_count = as.integer(passenger_count)
  )

How many pickups per borough?

# Count pickups per borough, restricted to rows where both ends of the
# trip were successfully assigned a borough.
borough_total_pickups <- taxi_raw %>% 
  dplyr::filter(!is.na(pickup_borough), !is.na(dropoff_borough)) %>% 
  group_by(pickup_borough) %>% 
  summarize(borough_pickups = n())

Let’s perform some EDA. First, let’s look into tipping. What’s the distribution of tip percentages?

TODO: Tried doing a ratio here but it didn’t work. Try again later?

## Warning in left_join_impl(x, y, by$x, by$y): joining factor and character
## vector, coercing into character vector

How many trips per day of week?

## Source: local data frame [7 x 2]
## 
##     weekday total_rides
## 1    Friday      141134
## 2    Monday      114706
## 3  Saturday      138597
## 4    Sunday      120767
## 5  Thursday      169025
## 6   Tuesday      156586
## 7 Wednesday      159185

How many trips per date? We can see here the lull on January 21st (MLK Day). There’s also a dip on January 6th, which is the Epiphany, but I’m not convinced that’s why we see less taxi traffic. Also puzzling is the peak on January 26th (Saturday).

Total revenue each day of the week? TODO: See if this is statistically significant

## Source: local data frame [7 x 2]
## 
##     weekday revenue
## 1  Thursday 2385540
## 2 Wednesday 2234333
## 3   Tuesday 2202680
## 4    Friday 1966782
## 5  Saturday 1846514
## 6    Sunday 1706065
## 7    Monday 1617159

Checking out the toll situation in the city. High tolls for Staten Island!

# Ride count and average toll for every (pickup, dropoff) borough pair,
# restricted to rides that actually paid a toll.
# NOTE(review): `taxi` is built from taxi_raw in an earlier (hidden) chunk.
# FIX: the original called ungroup() twice; the second call was a no-op.
taxi %>% 
  dplyr::filter(tolls_amount > 0) %>% 
  group_by(pickup_borough, dropoff_borough) %>% 
  summarize(num_rides_w_tolls = n(), average_toll = mean(tolls_amount)) %>% 
  ungroup() %>% 
  arrange(desc(average_toll)) # can also order by number of toll rides
## Source: local data frame [25 x 4]
## 
##    pickup_borough dropoff_borough num_rides_w_tolls average_toll
## 1           bronx   staten_island                38    11.742105
## 2       manhattan   staten_island              1089    11.303012
## 3        brooklyn   staten_island                60    11.147500
## 4          queens   staten_island                98    10.745408
## 5   staten_island        brooklyn                11    10.540909
## 6   staten_island       manhattan                 3    10.366667
## 7   staten_island   staten_island                41     9.532439
## 8   staten_island          queens                 3     9.350000
## 9        brooklyn          queens               118     6.250000
## 10  staten_island           bronx                 4     6.162500
## ..            ...             ...               ...          ...

How about trip length?

# Distance (rounded to whole miles) vs. duration (rounded to the nearest
# 10 seconds) for trips with a positive recorded distance.
taxi_length <- taxi %>% 
  dplyr::filter(trip_distance > 0) %>% 
  transmute(
    trip_distance = round(trip_distance),
    trip_time_in_secs = round(trip_time_in_secs, digits = -1)
  )

# Scatter of duration against distance; heavy overplotting handled with alpha.
ggplot(taxi_length, aes(x = trip_distance, y = trip_time_in_secs)) +
  geom_point(shape = 19, alpha = 1/10)

# Five-number summary of the rounded distance/duration columns.
summary(taxi_length) 
##  trip_distance     trip_time_in_secs
##  Min.   :  0.000   Min.   :   0.0   
##  1st Qu.:  1.000   1st Qu.: 360.0   
##  Median :  2.000   Median : 560.0   
##  Mean   :  2.797   Mean   : 686.1   
##  3rd Qu.:  3.000   3rd Qu.: 890.0   
##  Max.   :100.000   Max.   :9970.0

Looks like the vast majority of trips are within 25 miles and 50 minutes. This is expected. Per the summary above, the mean distance is about 2.8 miles and the mean time is about 11.4 minutes (686 seconds).

Can we find any trends with trips to airports? Perhaps there’s a flat rate fare. Here’s a histogram of fare amounts (not counting surcharges, tip, tolls, etc.).

Notice the peak at $52. Perhaps this is a flat rate?

# Isolate the exactly-$52 fares and map their dropoff locations.
taxi52 <- dplyr::filter(taxi, fare_amount == 52)

ggplot(taxi52, aes(x = dropoff_longitude, y = dropoff_latitude)) +
  geom_point(alpha = 1/10) +
  xlim(-74.2, -73.6) +
  ylim(40.5, 41)
## Warning: Removed 636 rows containing missing values (geom_point).

Here we see two clusters. Let’s examine more closely.

# Zoom in on the right-hand cluster of $52 dropoffs -- this is JFK airport.
ggplot(taxi52, aes(x = dropoff_longitude, y = dropoff_latitude)) +
  geom_point(alpha = 1/10) +
  ylim(40.635, 40.65) +
  xlim(-73.9, -73.7)
## Warning: Removed 10983 rows containing missing values (geom_point).

# Zoom in on the left-hand cluster -- midtown Manhattan (landmark unclear).
ggplot(taxi52, aes(x = dropoff_longitude, y = dropoff_latitude)) +
  geom_point(alpha = 1/10) +
  ylim(40.745, 40.775) +
  xlim(-74, -73.97)
## Warning: Removed 12218 rows containing missing values (geom_point).

# Where do the $52 rides begin? Plot all pickup locations for these fares.
ggplot(taxi52, aes(x = pickup_longitude, y = pickup_latitude)) +
  geom_point(alpha = 1/10) +
  ylim(40.5, 41) +
  xlim(-74.05, -73.75)
## Warning: Removed 559 rows containing missing values (geom_point).

# Restrict to $52 rides that dropped off inside the midtown cluster box,
# then map where those rides were picked up.
taxi52_pickups <- taxi52 %>% 
  dplyr::filter(dropoff_longitude < -73.97, dropoff_longitude > -74,
                dropoff_latitude > 40.745, dropoff_latitude < 40.775)

ggplot(taxi52_pickups, aes(x = pickup_longitude, y = pickup_latitude)) +
  geom_point(alpha = 1/10) +
  ylim(40.5, 41) +
  xlim(-75, -72.5)
## Warning: Removed 9 rows containing missing values (geom_point).

And would you look at that! The cluster to the right is at approximately -73.78 longitude and 40.645 latitude, landing us right on top of JFK airport. If we look at the pickup data for the other point in the dropoff graph, we find that these customers were picked up at the airport and brought to various places in NYC. The internet confirms that this ride is a flat rate either way.

What if we look at general pickup/dropoff locations? Will we find any especially popular spots other than the airport?

# Pickup density map of Manhattan. Very low alpha turns point density into
# a de facto heat map; the dense region traces out the street grid.
# TODO(review, from original comment): having trouble setting this alpha to
# the correct value -- want to be able to pick out specific long/lat points.
# Cool map of Manhattan for now!
ggplot(taxi, aes(x = pickup_longitude, y = pickup_latitude)) +
  geom_point(alpha = 1/500) +
  xlim(-74.01, -73.85) +
  ylim(40.7, 40.85) +
  theme_bw()
## Warning: Removed 69296 rows containing missing values (geom_point).

# Dropoff density over the wider metro area.
ggplot(taxi, aes(x = dropoff_longitude, y = dropoff_latitude)) +
  geom_point(alpha = 1/100) +
  xlim(-74.4, -73.4) +
  ylim(40.5, 41) +
  theme_bw()
## Warning: Removed 18962 rows containing missing values (geom_point).

By zooming in on this graph and identifying landmarks based on longitude/latitude data, I was able to identify the following as popular pick up and drop off locations: * LaGuardia Airport * Times Square * Penn Station/MSG * Columbus Circle * Car Inspection Station in Brooklyn * Dial 7 car/limo service in Queens * Brooklyn Driving School * Eight Strings and a Whistle * Various auto shops

Now let’s try some machine learning on this data set.

# Fit a regression tree predicting tip percentage from ride features.
# NOTE(review): tip_percentage is computed on `taxi` in an earlier (hidden)
# chunk -- confirm its definition (presumably tip_amount / fare_amount).
tip_percentages <- taxi %>%
  select(rate_code, passenger_count, trip_time_in_secs, trip_distance,
         payment_type, fare_amount, surcharge, mta_tax, tolls_amount,
         total_amount, pickup_borough, tip_percentage) %>%
  na.omit()

# Reproducible 75/25 train/test split. (floor() consumes no randomness, so
# seeding before computing smp_size is equivalent to the original order.)
set.seed(123)
smp_size <- floor(0.75 * nrow(tip_percentages))
train_ind <- sample(seq_len(nrow(tip_percentages)), size = smp_size)

train <- tip_percentages[train_ind, ]
test <- tip_percentages[-train_ind, ]

# Fit the tree on the training set and draw it.
rfit <- rpart(tip_percentage ~ ., data = train)
prp(rfit)

Maybe a correlogram can shed some more light on the relationships between our variables.

# Correlogram over the numeric columns, including the derived tip percentage.
taxi_num <- taxi %>% 
  dplyr::mutate(tip_percentage = tip_amount / fare_amount * 100) %>% 
  select(vendor_id, rate_code, passenger_count, trip_time_in_secs,
         trip_distance, pickup_longitude, pickup_latitude,
         dropoff_longitude, dropoff_latitude, fare_amount, surcharge,
         mta_tax, tip_amount, tolls_amount, total_amount, tip_percentage)

# Sort variables by correlation structure; upper triangle drawn as pie glyphs.
corrgram(taxi_num, order = TRUE, upper.panel = panel.pie)

Nothing too interesting here… the fare, trip time, trip distance, tip amount total amount, etc. are all positively correlated. The pickup longitude and latitude seem to only affect each other, which makes sense since in NYC longitudes are negative while latitudes are positive. Correlograms have been helpful in analyses of other data sets, but here this method just illuminates the lack of surprising correlations in our data.

Let’s take a look at trip efficiency. Here we’ll define efficiency as trip distance/trip time (this will give us the average mph of the trip).

# Average speed per trip: miles divided by hours (trip_time_in_secs / 3600).
# Zero-distance and zero-duration records are dropped to avoid Inf/NaN.
taxi_efficiency <- taxi %>% 
  dplyr::filter(trip_distance > 0 & trip_time_in_secs > 0) %>% 
  mutate(mph = trip_distance / (trip_time_in_secs / 3600)) %>% 
  arrange(mph)

# Speed by pickup hour; y-axis capped at 30 mph to drop implausible outliers.
# NOTE(review): pickup_hour is derived in an earlier (hidden) chunk.
ggplot(taxi_efficiency, aes(x = pickup_hour, y = mph)) +
  geom_point(alpha = 1/510) +
  ylim(0, 30) +
  geom_smooth(se = FALSE)
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## Warning: Removed 31976 rows containing missing values (stat_smooth).
## Warning: Removed 31976 rows containing missing values (geom_point).

Looks like the efficiency peaks at around 4:30am but remains pretty consistent from 7am-7pm. We see higher efficiency outside of work hours, as expected.

# Trips crawling below 2 mph (with a 0.01 mph floor to exclude bogus
# records) that also have non-zero pickup coordinates.
taxi_inefficient <- taxi_efficiency %>% 
  dplyr::filter(mph < 2, mph > .01, pickup_longitude != 0, pickup_latitude != 0)

# Pickups in blue, dropoffs in red (drawn on top), zoomed to Manhattan.
ggplot(data = taxi_inefficient) + 
  geom_point(aes(x = pickup_longitude, y = pickup_latitude),
             color = "blue", alpha = 1/25) + 
  geom_point(aes(x = dropoff_longitude, y = dropoff_latitude),
             color = "red", alpha = 1/25) +
  xlim(-74.02, -73.94) +
  ylim(40.7, 40.8)
## Warning: Removed 326 rows containing missing values (geom_point).
## Warning: Removed 558 rows containing missing values (geom_point).

Perhaps we can use machine learning to predict the length of a trip.

# Regression tree predicting trip duration (in minutes) from route,
# timing, and fare-context features.
# NOTE(review): pickup_date, pickup_hour, and weekday are derived in an
# earlier (hidden) chunk.
efficient_ml <- taxi_efficiency %>% 
  mutate(trip_time_in_mins = trip_time_in_secs / 60) %>% 
  # Drop the handful of coordinates north of the metro area.
  dplyr::filter(dropoff_latitude < 41, pickup_latitude < 41) %>% 
  select(rate_code, pickup_date, pickup_hour, passenger_count,
         pickup_longitude, pickup_latitude, dropoff_latitude,
         dropoff_longitude, pickup_borough, dropoff_borough, payment_type,
         surcharge, tolls_amount, weekday, trip_time_in_mins)

# Fit the ANOVA (regression) tree and draw it.
fit <- rpart(trip_time_in_mins ~ ., data = efficient_ml, method = "anova")
prp(fit)